Setup and packages used when creating this document
# Knit-time options: echo the code of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Libraries: tidyverse for data wrangling and ggplot2-based visualization.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
The data is sourced from Johns Hopkins University, with the raw CSV files read directly from the university's GitHub repository.
# Base URL of the Johns Hopkins CSSE time-series directory and the four
# files it holds (US/global x confirmed/deaths).
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
file_names <- c(
  "time_series_covid19_confirmed_US.csv",
  "time_series_covid19_confirmed_global.csv",
  "time_series_covid19_deaths_US.csv",
  "time_series_covid19_deaths_global.csv"
)
# The UID/ISO/FIPS population lookup table lives one directory up in the repo.
pop_url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
pop_file_name <- "csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
# Assemble full download URLs (paste0 is the base-R equivalent of str_c here).
urls <- paste0(url_in, file_names)
pop_url <- paste0(pop_url_in, pop_file_name)
Now that the data sources are defined, I read each URL into the R Markdown document.
# US confirmed cases: one row per county, one wide column per report date.
us_cases = read_csv(urls[1])
## Rows: 3342 Columns: 1154
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1148): UID, code3, FIPS, Lat, Long_, 1/22/20, 1/23/20, 1/24/20, 1/25/20...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Global confirmed cases: one row per country/province, one column per date.
global_cases = read_csv(urls[2])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# US deaths: same layout as US cases, plus a Population column.
us_deaths = read_csv(urls[3])
## Rows: 3342 Columns: 1155
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): iso2, iso3, Admin2, Province_State, Country_Region, Combined_Key
## dbl (1149): UID, code3, FIPS, Lat, Long_, Population, 1/22/20, 1/23/20, 1/24...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Global deaths: same wide layout as global cases.
global_deaths = read_csv(urls[4])
## Rows: 289 Columns: 1147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (1145): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# UID/ISO/FIPS lookup table; supplies Population for the global data.
global_pop = read_csv(pop_url)
## Rows: 4321 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): iso2, iso3, FIPS, Admin2, Province_State, Country_Region, Combined_Key
## dbl (5): UID, code3, Lat, Long_, Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
I tidy the data by reshaping the per-date columns into rows and parsing the dates for use in analysis and visualization. I exclude irrelevant columns, such as code columns not present in every data set and the latitude and longitude coordinates.
# US Cases: pivot the wide per-date columns into long (date, cases) rows,
# drop the identifier/geo columns not needed for analysis, and parse dates.
us_cases <- us_cases %>%
  pivot_longer(
    cols = -c(UID, iso2, iso3, code3, FIPS, Admin2, Province_State,
              Country_Region, Lat, Long_, Combined_Key),
    names_to = "date",
    values_to = "cases"
  ) %>%
  # Fix: removed the trailing comma inside c(...) — base R's c() rejects an
  # empty argument, so the original only worked by tidyselect's leniency.
  select(-c(UID, iso2, iso3, code3, FIPS, Lat, Long_)) %>%
  mutate(date = mdy(date))
# Global Cases: discard coordinates first, then reshape the remaining
# per-date columns to long form and parse the date strings.
global_cases <- global_cases %>%
  select(-c(Lat, Long)) %>%
  pivot_longer(
    cols = -c(`Province/State`, `Country/Region`),
    names_to = "date",
    values_to = "cases"
  ) %>%
  mutate(date = mdy(date))
# US Deaths: same reshape as US cases; Population is kept for later analysis.
us_deaths <- us_deaths %>%
  pivot_longer(
    cols = -c(iso2, iso3, Admin2, Province_State, Country_Region,
              Combined_Key, UID, code3, FIPS, Lat, Long_, Population),
    names_to = "date",
    values_to = "deaths"
  ) %>%
  # Fix: removed the trailing commas inside c(...) — base R's c() rejects an
  # empty argument, so the original only worked by tidyselect's leniency.
  select(-c(iso2, iso3, UID, code3, FIPS, Lat, Long_)) %>%
  mutate(date = mdy(date))
# Global Deaths: identical reshape to global cases, with cumulative deaths
# as the value column.
global_deaths <- global_deaths %>%
  select(-c(Lat, Long)) %>%
  pivot_longer(
    cols = -c(`Province/State`, `Country/Region`),
    names_to = "date",
    values_to = "deaths"
  ) %>%
  mutate(date = mdy(date))
I transform the data so that the US data set will properly join together. I do this by renaming “Country/Region” and “Province/State”. I also rename Admin2, for easier visibility on the US data and my own access. Then I create a new “combined key” and add a “population” column for the global data so that it’s uniform with the relevant columns in the US data, and I can perform better analysis with population.
# Merge global data sets: cases + deaths joined on place and date.
# Fix: spell out the join keys instead of relying on dplyr's natural-join
# inference (the keys match the message the implicit join printed).
global <- global_cases %>%
  full_join(global_deaths,
            by = c("Province/State", "Country/Region", "date")) %>%
  rename(Country_Region = `Country/Region`,
         Province_State = `Province/State`) %>%
  # Keep only rows from the first reported case onward.
  filter(cases > 0)
# Merge US data sets: cases + deaths joined on the shared identifier columns.
# Fix: explicit join keys (matching the message the implicit join printed)
# so a future column change cannot silently alter the join.
us <- us_cases %>%
  full_join(us_deaths,
            by = c("Admin2", "Province_State", "Country_Region",
                   "Combined_Key", "date")) %>%
  # Admin2 holds the county name in the JHU US files.
  rename(County = Admin2) %>%
  filter(cases > 0)
# Make Combined_Key column.
# Fix: the original united into "Comibed_Key" (typo), so the key it built was
# never used — the final select silently picked up Combined_Key from the
# population lookup table instead. The sep ", " matches JHU's own format.
global <- global %>%
  unite("Combined_Key",
        c(Province_State, Country_Region),
        sep = ", ",
        na.rm = TRUE,
        remove = FALSE)
# Add population column. Only the join keys + Population are taken from the
# lookup table so the join cannot clash with (and suffix) Combined_Key.
global <- global %>%
  left_join(global_pop %>%
              select(Province_State, Country_Region, Population),
            by = c("Province_State", "Country_Region")) %>%
  select(Province_State, Country_Region, date, cases, deaths, Population,
         Combined_Key)
Here I created an interactive plot of a US map. Each state is color coded for the deaths per million as of September 9th 2023, and the user can hover their mouse over each state showing the state name, deaths per million, total deaths, and population (as of Sep. 9, 2023). Since plotly uses the abbreviation for each US state, I had to look up and convert each state name into its two-letter abbreviation, with a manual entry needed for "District of Columbia". The hover function is not accessible across all platforms, but it is still cool.
# State abbreviations: plotly's "USA-states" mode needs two-letter codes.
# Base R supplies the 50 names/abbreviations; DC is appended by hand.
state_abbrev <- tibble(Province_State = c(state.name, "District of Columbia"),
                       state_abbr = c(state.abb, "DC"))

# Most recent report date present in the US data.
latest_date <- max(us$date, na.rm = TRUE)

# One row per state on the latest date: total deaths, population, and rate.
us_states_latest <- us %>%
  filter(date == latest_date) %>%
  group_by(Province_State) %>%
  summarise(
    deaths = sum(deaths, na.rm = TRUE),
    Population = sum(Population, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(deaths_per_million = (deaths / Population) * 1e6) %>%
  left_join(state_abbrev, by = "Province_State") %>%
  # Drop territories/entities that have no two-letter state code.
  filter(!is.na(state_abbr))
# Make interactive choropleth: states shaded by deaths per million, with a
# hover label carrying the state name, rate, total deaths, and population.
hover_text <- paste0(
  us_states_latest$Province_State,
  "<br>Deaths per Million: ", round(us_states_latest$deaths_per_million, 1),
  "<br>Total Deaths: ", scales::comma(us_states_latest$deaths),
  "<br>Population: ", scales::comma(us_states_latest$Population)
)
plot_us <- plot_geo(us_states_latest, locationmode = "USA-states") %>%
  add_trace(
    z = ~deaths_per_million,
    locations = ~state_abbr,
    text = hover_text,
    hoverinfo = "text",
    color = ~deaths_per_million,
    colors = "Blues"
  ) %>%
  colorbar(title = "Deaths per Million") %>%
  layout(
    title = paste0("COVID-19 Deaths per Million (", latest_date, ")"),
    geo = list(scope = "usa")
  )
plot_us